MacTech 1 to 12

home *** CD-ROM | disk | FTP | other *** search

/ MacTech 1 to 12 / MacTech-vol-1-12.toast / Source / MacTech® Magazine / Volume 10 - 1994 / 10.04 Apr 94 / Programmers' Challenge / anagram.c

Wrap

C/C++ Source or Header | 1994-02-13 | 8.8 KB | 316 lines | [TEXT/KAHL]

/* Anagram Programmer's Challenge by Larry Landry This implementation uses a large amount of memory to optimize the CPU utilization. To guarantee that we have enough memory for all matching words, we actually allocate an array of pointers for 30,000 words. Since the rules stated that there would be about 20,000 words in the dictionary, even if every word matched, we would still have enough storage. In reality this number could probably be less than 1-200 for all but the most rare of scenarios. The basic algorithm is: 1) Convert the input string into a table of counts for each character from a-z. So "sammy" would have a count of 2 for "m" and 1 for each of "s", "a", and "y". This makes testing for the presence of a character as simple as checking and indexed value in an array. 2) Parse through the dictionary and find the words that can be composed of some portion of characters from the input characters. Build a list of pointers to each word. The number of words in this list will be in the tens instead of thousands. 3) Recursively process the words in this list and find strings of words that use up all of the characters. For each matching sequence, output the words to the file. The processing requrired by this algorithm is then D * C1 + M * log2(M) * C2 where D = size of input dictionary M = number of matching words C1 & C2 are constants This algorithm works very well for cases where there are few words that match the input letters. The worst case scenario where all words can be made from the input letters will still take a very long time. I expect that matching words will typically be less than 100. */ #include <stdio.h> typedef unsigned char uchar; typedef unsigned short ushort; typedef unsigned long ulong; #define MAX_WORDS 30000L #define OUTPUT_BUFFER_SIZE 10000L #define RETURN '\n' typedef struct { char*fWordStart; short fWordLength; } WordLoc; /* Usage counts for each character (only indexes 'a' to 'z' are actually used) */ typedef uchar CharData[256]; unsigned long Anagram(Str255 inputText, FILE *wordList, FILE *outputFile); ulong findInputWords(register char *wordBuffer, WordLoc *validWords); ulong findAnagrams(short numValidChars, ulong wordCount, WordLoc *validWords, short prevWordCount); /* I use some global variables here to avoid passing them down into the recursive routine findAnagrams(). These values are constant once findAnagrams() is called. */ char gOutputBuffer[OUTPUT_BUFFER_SIZE]; char *gOutputBufferEnd = gOutputBuffer + OUTPUT_BUFFER_SIZE - 512; char *gOutputPtr; CharData gValidChars; WordLoc *gWordsInUse[255]; FILE *gOutputFile; unsigned long Anagram(Str255 inputText, FILE *wordList, FILE *outputFile) { fpos_t wordBufferLength; char*wordBuffer; short index; short numValidChars; WordLoc validWords[MAX_WORDS]; char ch; ulong wordCount; gOutputFile = outputFile; gOutputPtr = gOutputBuffer; /* To save on file I/O time, read the whole file all at once. First, find the length of the file by seeking the end and finding the file pos. Then allocate a buffer of that size, plus 2 bytes (for a return and NULL char) and read the data into it. Finally put the return and NULL char at the end. */ fseek(wordList, 0L, SEEK_END); fgetpos(wordList, &wordBufferLength); wordBuffer = (char*) NewPtr((Size) wordBufferLength + 2); if (wordBuffer == NULL) return 0L; /* real error handling here */ rewind(wordList); fread(wordBuffer, (size_t) 1, (size_t) wordBufferLength, wordList); if (wordBuffer[wordBufferLength-1] != RETURN) wordBuffer[wordBufferLength++] = RETURN; wordBuffer[wordBufferLength] = '\0'; /* To save time ruling out words, we build a list of the valid characters in the words. We start with no valid characters. */ for (index='a'; index<'z'; index++) gValidChars[index] = 0; /* Now build the list of valid characters. Each array entry will be a count of how many times that character is present. */ numValidChars = *inputText++; for (index=numValidChars; index>0; index--) if ((ch = *inputText++) != ' ') gValidChars[ch]++; else numValidChars--; /* Find the list of words that can be made up from the letters in the input word */ wordCount = findInputWords(wordBuffer, &validWords[MAX_WORDS-1]); /* Now find the list of full anagrams that can be created from these words */ wordCount = findAnagrams(numValidChars, wordCount, &validWords[MAX_WORDS-wordCount], 0); /* Write the results to the output */ *gOutputPtr = 0;/* Terminate the string */ fprintf(outputFile, gOutputBuffer); DisposPtr(wordBuffer); return wordCount; } /* Anagram */ ulong findInputWords(register char *wordBuffer, WordLoc *validWords) { char*saveStart = wordBuffer; ulong numberWords = 0; char ch; while (*wordBuffer) { ch = *wordBuffer++; if (ch == RETURN) { /* Record this entry as a valid word */ numberWords++; validWords->fWordStart = saveStart; validWords->fWordLength = (short)(wordBuffer - saveStart - 1); validWords--; wordBuffer--; while (saveStart < wordBuffer) gValidChars[*saveStart++]++; /* Save the new start of word pointer */ saveStart = ++wordBuffer; } else if (gValidChars[ch]) gValidChars[ch]--; else { /* This word didn't match so reset and go to the next word */ wordBuffer--; while (saveStart < wordBuffer) gValidChars[*saveStart++]++; while (*wordBuffer++ != RETURN) ; /* Save the new start of word pointer */ saveStart = wordBuffer; } /* else */ } /* while */ return numberWords; } /* findInputWords */ ulong findAnagrams(short numValidChars, ulong wordCount, WordLoc *validWords, short prevWordCount) { ulong wordIndex; ulong usedIndex; short chIndex; ulong matchCount = 0; Boolean wordFits; char ch; char*tempPtr; WordLoc *theWord; /* Try each word we have against the list of characters. */ for (wordIndex=0; wordIndex<wordCount; wordIndex++) { /* If there aren't enough characters left, it can't be a match */ if (validWords->fWordLength <= numValidChars) { /* Go through the chars in this word testing to make sure that there is at least one of each char available */ wordFits = TRUE; for (chIndex=0; chIndex<validWords->fWordLength; chIndex++) { ch = validWords->fWordStart[chIndex]; if (gValidChars[ch]) gValidChars[ch]--; else { /* Found an unavailable character, so this can't be part of the anagram. Reset the character usage array and go to the next word. */ wordFits = FALSE; while (--chIndex >= 0) gValidChars[validWords->fWordStart[chIndex]]++; break; /* get out of the for loop */ } /* else */ } /* for */ if (wordFits) { /* This word fit, so see if it uses all the characters. If so, then we have found an anagram. Output the anagram and increment the anagram count. */ if (validWords->fWordLength == numValidChars) { matchCount++; /* Copy the previous words for this anagram separated by spaces. */ for (usedIndex=0; usedIndex<prevWordCount; usedIndex++) { theWord = gWordsInUse[usedIndex]; memcpy(gOutputPtr, theWord->fWordStart, (size_t) theWord->fWordLength); gOutputPtr += theWord->fWordLength; *gOutputPtr++ = ' '; } /* for */ /* Now copy this new word and a return character */ memcpy(gOutputPtr, validWords->fWordStart, (size_t) validWords->fWordLength); gOutputPtr += validWords->fWordLength; *gOutputPtr++ = RETURN; /* To ensure that we don't overrun the output buffer check against the end of the buffer. If the end pointer has been passed, write the data to the file and reset the output pointer to the beginning of the buffer. */ if (gOutputPtr > gOutputBufferEnd) { *gOutputPtr = 0;/* Terminate the string */ fprintf(gOutputFile, gOutputBuffer); gOutputPtr = gOutputBuffer; } /* if */ } else { /* This word did fit, but didn't use all of the characters so add it to the list of previous words in the anagram and then call this procedure recursively to find if there are more words that can be added to make an anagram with this base. */ gWordsInUse[prevWordCount] = validWords; matchCount += findAnagrams( numValidChars - validWords->fWordLength, wordCount - wordIndex, validWords, prevWordCount + 1); } /* else */ /* Now undo the characters we took out of the validChar array */ for (chIndex=0; chIndex<validWords->fWordLength; chIndex++) gValidChars[validWords->fWordStart[chIndex]]++; } /* if */ } /* if */ validWords++; } /* for */ return matchCount; } /* findAnagrams */